import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
# Load the mobile-price dataset; fields are semicolon-separated and the
# first CSV column becomes the row index.
dfMobile = pd.read_csv("Mobile_price_id.csv", sep=';', index_col=0)
# Preview the first 10 rows to sanity-check the load.
dfMobile.head(10)
| battery_power | bluetooth | clock_speed | dual_sim | fc | four_g | int_memory | m_dep | mobile_wt | n_cores | ... | px_height | px_width | ram | sc_h | sc_w | talk_time | three_g | touch_screen | wifi | price_range | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Index | |||||||||||||||||||||
| 0 | 842 | 0.0 | 2.2 | 0 | 1 | 0 | 7 | 0.6 | 188 | 2 | ... | 20 | 756 | 2549 | 9 | 7 | 19 | 0 | 0 | 1 | 1 |
| 1 | 1021 | 1.0 | 0.5 | 1 | 0 | 1 | 53 | 0.7 | 136 | 3 | ... | 905 | 1988 | 2631 | 17 | 3 | 7 | 1 | 1 | 0 | 2 |
| 2 | 563 | 1.0 | 0.5 | 1 | 2 | 1 | 41 | 0.9 | 145 | 5 | ... | 1263 | 1716 | 2603 | 11 | 2 | 9 | 1 | 1 | 0 | 2 |
| 3 | 615 | 1.0 | 2.5 | 0 | 0 | 0 | 10 | 0.8 | 131 | 6 | ... | 1216 | 1786 | 2769 | 16 | 8 | 11 | 1 | 0 | 0 | 2 |
| 4 | 1821 | 1.0 | 1.2 | 0 | 13 | 1 | 44 | 0.6 | 141 | 2 | ... | 1208 | 1212 | 1411 | 8 | 2 | 15 | 1 | 1 | 0 | 1 |
| 5 | 1859 | 0.0 | NaN | 1 | 3 | 0 | 22 | 0.7 | 164 | 1 | ... | 1004 | 1654 | 1067 | 17 | 1 | 10 | 1 | 0 | 0 | 1 |
| 6 | 1821 | NaN | 1.7 | 0 | 4 | 1 | 10 | 0.8 | 139 | 8 | ... | 381 | 1018 | 3220 | 13 | 8 | 18 | 1 | 0 | 1 | 3 |
| 7 | 1954 | 0.0 | 0.5 | 1 | 0 | 0 | 24 | 0.8 | 187 | 4 | ... | 512 | 1149 | 700 | 16 | 3 | 5 | 1 | 1 | 1 | 0 |
| 8 | 1445 | 1.0 | 0.5 | 0 | 0 | 0 | 53 | 0.7 | 174 | 7 | ... | 386 | 836 | 1099 | 17 | 1 | 20 | 1 | 0 | 0 | 0 |
| 9 | 509 | 1.0 | NaN | 1 | 2 | 1 | 9 | 0.1 | 93 | 5 | ... | 1137 | 1224 | 513 | 19 | 10 | 12 | 1 | 0 | 0 | 0 |
10 rows × 21 columns
# Inspect dtypes and non-null counts to locate missing values.
dfMobile.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2000 entries, 0 to 1999 Data columns (total 21 columns): battery_power 2000 non-null int64 bluetooth 1995 non-null float64 clock_speed 1995 non-null float64 dual_sim 2000 non-null int64 fc 2000 non-null int64 four_g 2000 non-null int64 int_memory 2000 non-null int64 m_dep 2000 non-null float64 mobile_wt 2000 non-null int64 n_cores 2000 non-null int64 pc 2000 non-null int64 px_height 2000 non-null int64 px_width 2000 non-null int64 ram 2000 non-null int64 sc_h 2000 non-null int64 sc_w 2000 non-null int64 talk_time 2000 non-null int64 three_g 2000 non-null int64 touch_screen 2000 non-null int64 wifi 2000 non-null int64 price_range 2000 non-null int64 dtypes: float64(3), int64(18) memory usage: 343.8 KB
We can see that there are some missing values: the "bluetooth" and "clock_speed" columns each have 5 missing values. Furthermore, many columns are categorical but are represented as float data.
So we decided to keep the categorical features alongside the numeric features.
# Impute the two columns with missing values: the mode for the binary
# 'bluetooth' flag, and the mean (rounded to 2 decimals) for the
# continuous 'clock_speed'.
fill_values = {
    "bluetooth": dfMobile["bluetooth"].mode()[0],
    "clock_speed": round(dfMobile["clock_speed"].mean(), 2),
}
dfMobile = dfMobile.fillna(fill_values)
# Confirm that no missing values remain.
dfMobile.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2000 entries, 0 to 1999 Data columns (total 21 columns): battery_power 2000 non-null int64 bluetooth 2000 non-null float64 clock_speed 2000 non-null float64 dual_sim 2000 non-null int64 fc 2000 non-null int64 four_g 2000 non-null int64 int_memory 2000 non-null int64 m_dep 2000 non-null float64 mobile_wt 2000 non-null int64 n_cores 2000 non-null int64 pc 2000 non-null int64 px_height 2000 non-null int64 px_width 2000 non-null int64 ram 2000 non-null int64 sc_h 2000 non-null int64 sc_w 2000 non-null int64 talk_time 2000 non-null int64 three_g 2000 non-null int64 touch_screen 2000 non-null int64 wifi 2000 non-null int64 price_range 2000 non-null int64 dtypes: float64(3), int64(18) memory usage: 343.8 KB
Now the dataset is processed.
# Boxplot of every feature to eyeball distributions and outliers.
# A single loop replaces 20 copy-pasted subplot/boxplot pairs; the
# subplot positions (1..20) and column order match the original exactly.
feature_cols = ['battery_power', 'bluetooth', 'clock_speed', 'dual_sim', 'fc',
                'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores',
                'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w',
                'talk_time', 'three_g', 'touch_screen', 'wifi']
plt.figure(figsize=(20, 20), dpi=200)
for pos, col in enumerate(feature_cols, start=1):
    plt.subplot(4, 5, pos)
    sns.boxplot(x=col, data=dfMobile)
<matplotlib.axes._subplots.AxesSubplot at 0x16a63e5d940>
import matplotlib.pyplot as plt
import seaborn as sns
# Correlation heatmap across all columns, including the target
# price_range, to spot the strongest predictors.
plt.figure(figsize=(12,8))
ax = sns.heatmap(dfMobile.corr(), cmap='Greens', annot=True)
plt.title("Correlation Matrix", fontsize=20)
plt.show()
We can see that our label is strongly correlated with the ram column, the battery power and the dimensions of the mobile.
# Feature matrix: every column except the target.
columns = ['battery_power', 'bluetooth', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc','px_height','px_width','ram','sc_h','sc_w','talk_time','three_g','touch_screen','wifi']
x = dfMobile[columns]
# Target: the 4-class ordinal price range (0 = cheapest, 3 = most expensive).
label = dfMobile['price_range']
from pandas.plotting import scatter_matrix
# Histograms of the non-binary numeric columns plus the target.
# NOTE(review): scatter_matrix is imported here but never used below.
column_num=['battery_power','clock_speed','fc','int_memory','m_dep', 'mobile_wt', 'n_cores', 'pc','px_height','px_width','ram','sc_h','sc_w','talk_time','price_range']
dfMobile.hist(column =column_num ,figsize = (17,18));
# Pairwise scatter of the features most correlated with the target,
# colored by price_range.
DfMobile_num=dfMobile[['battery_power','pc','px_height','px_width','ram','price_range']]
sns.pairplot(DfMobile_num, hue="price_range")
<seaborn.axisgrid.PairGrid at 0x16a650b9978>
# Distribution of the ram feature on its own.
sns.displot(DfMobile_num, x='ram')
plt.title('Ram Feature', weight='bold')
Text(0.5, 1.0, 'Ram Feature')
# Linear-fit scatter of ram vs price_range (one regression per class,
# since hue='price_range' splits the fit by class).
sns.lmplot(x='ram', y='price_range', data=DfMobile_num,hue="price_range")
plt.yticks([0, 1, 2, 3])
plt.xlabel('Ram')
plt.ylabel('Price Range')
plt.title('Ram\'s correlation to Price Range', weight='bold')
plt.show()
# Battery power distribution per price class.
sns.boxplot(x='price_range', y='battery_power', data=DfMobile_num)
plt.xlabel('Price Range')
plt.ylabel('Battery Power')
plt.title('Battery Power\'s correlation to Price Range', weight='bold')
plt.show()
# Pie chart of the 4G flag.
four_g = dfMobile['four_g'].value_counts()
plt.title('Percentage of Mobiles with 4G', weight='bold')
# Derive the labels from the value_counts index instead of hard-coding
# ['4G', 'No 4G']: value_counts sorts by frequency, so a fixed list would
# mislabel the slices whenever 0 happened to be the more common value.
labels_4g = ['4G' if v == 1 else 'No 4G' for v in four_g.index]
four_g.plot.pie(autopct="%.1f%%", labels=labels_4g)
plt.show()
# Pie chart of the 3G flag.
three_g = dfMobile['three_g'].value_counts()
plt.title('Percentage of Mobiles with 3G', weight='bold')
# Labels derived from the index: value_counts orders by frequency, so a
# hard-coded ['3G', 'No 3G'] list only works by coincidence.
labels_3g = ['3G' if v == 1 else 'No 3G' for v in three_g.index]
three_g.plot.pie(autopct="%.1f%%", labels=labels_3g)
plt.show()
# Share of phones per core count (slices auto-labelled by index).
n_cores = dfMobile['n_cores'].value_counts()
plt.title('Number of cores in mobile phones\n\n', weight='bold')
n_cores.plot.pie(autopct="%.1f%%", radius=1.5)
plt.show()
# price_range distribution for phones with/without a touch screen.
sns.boxplot(x='touch_screen',y='price_range',data=dfMobile)
plt.title('touch screen correlation to price range', weight='bold')
plt.show()
# price_range distribution for single- vs dual-SIM phones.
sns.boxplot(x='dual_sim',y='price_range',data=dfMobile)
plt.title('dual sim correlation to price range', weight='bold')
plt.show()
Let's apply some unsupervised learning algorithms and see how they perform on this dataset.
First, let's try PCA to reduce our dimensionality, combined with K-means.
from sklearn.preprocessing import StandardScaler
# Standardize features to zero mean / unit variance; PCA and K-means are
# both scale-sensitive.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(x)
data_scaled
array([[-0.90259726, -0.98708341, 0.83070733, ..., -1.78686097,
-1.00601811, 0.98609664],
[-0.49513857, 1.01308561, -1.25649702, ..., 0.55964063,
0.99401789, -1.01409939],
[-1.5376865 , 1.01308561, -1.25649702, ..., 0.55964063,
0.99401789, -1.01409939],
...,
[ 1.53077336, -0.98708341, -0.76539011, ..., 0.55964063,
0.99401789, -1.01409939],
[ 0.62252745, -0.98708341, -0.76539011, ..., 0.55964063,
0.99401789, 0.98609664],
[-1.65833069, 1.01308561, 0.58515388, ..., 0.55964063,
0.99401789, 0.98609664]])
from sklearn.decomposition import PCA
# Project the standardized features onto the first 4 principal components.
nb_components = 4
pca = PCA(n_components = nb_components)
pca.fit(data_scaled)
data_pca = pca.transform(data_scaled)
# Loadings heatmap: how much each original variable contributes to each
# principal component (rows = variables, columns = components).
plt.figure(figsize=(8,5))
colnames = columns
hm = sns.heatmap(pca.components_.T, cbar=True, annot=True, cmap='bwr', fmt=".2f",
                 annot_kws={"size":10}, yticklabels=colnames, vmax=1, vmin=-1, center=0)
plt.xlabel("pca",fontsize=13)
plt.ylabel("variables",fontsize=13)
plt.title("Correlation between variables and pca components",fontsize=18)
Text(0.5, 1.0, 'Correlation between variables and pca components')
plt.figure(figsize=(8,6))
# Pass x/y as keyword arguments: positional use raises the FutureWarning
# seen in the original output and is an error in seaborn 0.12+.
# NOTE(review): columns 1 and 3 of data_pca are the 2nd and 4th principal
# components, not the 1st and 2nd — axis labels corrected to match what
# is actually plotted.
sns.scatterplot(x=data_pca[:,1], y=data_pca[:,3], hue=label, legend='full')
plt.xlabel('Second principal component')
plt.ylabel('Fourth principal component')
C:\Users\mayou\Anaconda3\lib\site-packages\seaborn\_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
Text(0, 0.5, 'Second Principal Component')
from sklearn.cluster import KMeans
import numpy as np
# 2-D matrix built from the two selected PCA components (same values as
# the original np.array(list(zip(...))), built without the Python zip).
X = np.column_stack((data_pca[:, 1], data_pca[:, 3]))
colors = ['b', 'g', 'c', 'k']
markers = ['o', 'v', 'h', '+']
plt.ylabel('PCA dimension')
kmeans = KMeans(n_clusters=4).fit(X)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=200, c='red', label='Centroids')
# Plot each cluster with one vectorized scatter call instead of one
# plt.plot call per point (4 calls instead of 2000).
for cluster_id in range(4):
    mask = kmeans.labels_ == cluster_id
    plt.scatter(X[mask, 0], X[mask, 1],
                color=colors[cluster_id], marker=markers[cluster_id])
plt.xlabel(' PCA dimension ')
plt.legend()
plt.show()
Elbow method to find the number of clusters, even though we expect 4 clusters to match our target variable.
# Elbow method: sum of squared distances (inertia) for k = 2..8.
ssd = []
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
for num_clusters in range_n_clusters:
    kkmeans = KMeans(n_clusters=num_clusters, max_iter=1000)
    kkmeans.fit(X)
    ssd.append(kkmeans.inertia_)
# Plot against the actual k values so the x-axis reads 2..8 rather than
# the list indices 0..6.
plt.plot(range_n_clusters, ssd)
[<matplotlib.lines.Line2D at 0x16a6ce12c50>]
Internal Index : Silhouette Score
from sklearn.metrics import silhouette_score
# Internal index: silhouette score for each candidate cluster count.
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
for num_clusters in range_n_clusters:
    # fit K-means with the candidate number of clusters
    candidate_model = KMeans(n_clusters=num_clusters, max_iter=1000)
    candidate_model.fit(X)
    # average silhouette over all samples for this clustering
    silhouette_avg = silhouette_score(X, candidate_model.labels_)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))
For n_clusters=2, the silhouette score is 0.33984636949290237 For n_clusters=3, the silhouette score is 0.3630275884730351 For n_clusters=4, the silhouette score is 0.3392835856384422 For n_clusters=5, the silhouette score is 0.32953005390567713 For n_clusters=6, the silhouette score is 0.34122292301162543 For n_clusters=7, the silhouette score is 0.33222474061129126 For n_clusters=8, the silhouette score is 0.3300991031667386
# Attach the K-means cluster assignment to the main DataFrame.
dfMobile['K-Means_Cluster_ID']=kmeans.labels_
dfMobile.head(100)
| battery_power | bluetooth | clock_speed | dual_sim | fc | four_g | int_memory | m_dep | mobile_wt | n_cores | ... | px_width | ram | sc_h | sc_w | talk_time | three_g | touch_screen | wifi | price_range | K-Means_Cluster_ID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Index | |||||||||||||||||||||
| 0 | 842 | 0.0 | 2.20 | 0 | 1 | 0 | 7 | 0.6 | 188 | 2 | ... | 756 | 2549 | 9 | 7 | 19 | 0 | 0 | 1 | 1 | 0 |
| 1 | 1021 | 1.0 | 0.50 | 1 | 0 | 1 | 53 | 0.7 | 136 | 3 | ... | 1988 | 2631 | 17 | 3 | 7 | 1 | 1 | 0 | 2 | 2 |
| 2 | 563 | 1.0 | 0.50 | 1 | 2 | 1 | 41 | 0.9 | 145 | 5 | ... | 1716 | 2603 | 11 | 2 | 9 | 1 | 1 | 0 | 2 | 2 |
| 3 | 615 | 1.0 | 2.50 | 0 | 0 | 0 | 10 | 0.8 | 131 | 6 | ... | 1786 | 2769 | 16 | 8 | 11 | 1 | 0 | 0 | 2 | 2 |
| 4 | 1821 | 1.0 | 1.20 | 0 | 13 | 1 | 44 | 0.6 | 141 | 2 | ... | 1212 | 1411 | 8 | 2 | 15 | 1 | 1 | 0 | 1 | 2 |
| 5 | 1859 | 0.0 | 1.52 | 1 | 3 | 0 | 22 | 0.7 | 164 | 1 | ... | 1654 | 1067 | 17 | 1 | 10 | 1 | 0 | 0 | 1 | 2 |
| 6 | 1821 | 0.0 | 1.70 | 0 | 4 | 1 | 10 | 0.8 | 139 | 8 | ... | 1018 | 3220 | 13 | 8 | 18 | 1 | 0 | 1 | 3 | 1 |
| 7 | 1954 | 0.0 | 0.50 | 1 | 0 | 0 | 24 | 0.8 | 187 | 4 | ... | 1149 | 700 | 16 | 3 | 5 | 1 | 1 | 1 | 0 | 3 |
| 8 | 1445 | 1.0 | 0.50 | 0 | 0 | 0 | 53 | 0.7 | 174 | 7 | ... | 836 | 1099 | 17 | 1 | 20 | 1 | 0 | 0 | 0 | 0 |
| 9 | 509 | 1.0 | 1.52 | 1 | 2 | 1 | 9 | 0.1 | 93 | 5 | ... | 1224 | 513 | 19 | 10 | 12 | 1 | 0 | 0 | 0 | 1 |
| 10 | 769 | 0.0 | 2.90 | 1 | 0 | 0 | 9 | 0.1 | 182 | 5 | ... | 874 | 3946 | 5 | 2 | 7 | 0 | 0 | 0 | 3 | 3 |
| 11 | 1520 | 1.0 | 2.20 | 0 | 5 | 1 | 33 | 0.5 | 177 | 8 | ... | 1005 | 3826 | 14 | 9 | 13 | 1 | 1 | 1 | 3 | 1 |
| 12 | 1815 | 0.0 | 2.80 | 0 | 2 | 0 | 33 | 0.6 | 159 | 4 | ... | 748 | 1482 | 18 | 0 | 2 | 1 | 0 | 0 | 1 | 0 |
| 13 | 803 | 1.0 | 2.10 | 0 | 7 | 0 | 17 | 1.0 | 198 | 4 | ... | 1440 | 2680 | 7 | 1 | 4 | 1 | 0 | 1 | 2 | 2 |
| 14 | 1866 | 0.0 | 0.50 | 0 | 13 | 1 | 52 | 0.7 | 185 | 1 | ... | 563 | 373 | 14 | 9 | 3 | 1 | 0 | 1 | 0 | 1 |
| 15 | 775 | 0.0 | 1.00 | 0 | 3 | 0 | 46 | 0.7 | 159 | 2 | ... | 1864 | 568 | 17 | 15 | 11 | 1 | 1 | 1 | 0 | 1 |
| 16 | 838 | 0.0 | 1.52 | 0 | 1 | 1 | 13 | 0.1 | 196 | 8 | ... | 1850 | 3554 | 10 | 9 | 19 | 1 | 0 | 1 | 3 | 2 |
| 17 | 595 | 0.0 | 0.90 | 1 | 7 | 1 | 23 | 0.1 | 121 | 3 | ... | 810 | 3752 | 10 | 2 | 18 | 1 | 1 | 0 | 3 | 1 |
| 18 | 1131 | 0.0 | 0.50 | 1 | 11 | 0 | 49 | 0.6 | 101 | 5 | ... | 878 | 1835 | 19 | 13 | 16 | 1 | 1 | 0 | 1 | 1 |
| 19 | 682 | 1.0 | 0.50 | 0 | 4 | 0 | 19 | 1.0 | 121 | 4 | ... | 1064 | 2337 | 11 | 1 | 18 | 0 | 1 | 1 | 1 | 3 |
| 20 | 772 | 0.0 | 1.10 | 1 | 12 | 0 | 39 | 0.8 | 81 | 7 | ... | 1854 | 2819 | 17 | 15 | 3 | 1 | 1 | 0 | 3 | 1 |
| 21 | 1709 | 1.0 | 2.10 | 0 | 1 | 0 | 13 | 1.0 | 156 | 2 | ... | 1385 | 3283 | 17 | 1 | 15 | 1 | 0 | 0 | 3 | 3 |
| 22 | 1949 | 0.0 | 1.52 | 1 | 4 | 0 | 47 | 0.3 | 199 | 4 | ... | 822 | 1433 | 11 | 5 | 20 | 0 | 0 | 1 | 1 | 0 |
| 23 | 1602 | 1.0 | 2.80 | 1 | 4 | 1 | 38 | 0.7 | 114 | 3 | ... | 788 | 1037 | 8 | 7 | 20 | 1 | 0 | 0 | 0 | 1 |
| 24 | 503 | 0.0 | 1.20 | 1 | 5 | 1 | 8 | 0.4 | 111 | 3 | ... | 1245 | 2583 | 11 | 0 | 12 | 1 | 0 | 0 | 1 | 2 |
| 25 | 961 | 1.0 | 1.40 | 1 | 0 | 1 | 57 | 0.6 | 114 | 8 | ... | 1434 | 2782 | 18 | 9 | 7 | 1 | 1 | 1 | 2 | 1 |
| 26 | 519 | 1.0 | 1.60 | 1 | 7 | 1 | 51 | 0.3 | 132 | 4 | ... | 645 | 3763 | 16 | 1 | 4 | 1 | 0 | 1 | 3 | 1 |
| 27 | 956 | 0.0 | 0.50 | 0 | 1 | 1 | 41 | 1.0 | 143 | 7 | ... | 1075 | 3286 | 17 | 8 | 12 | 1 | 1 | 0 | 3 | 1 |
| 28 | 1453 | 0.0 | 1.60 | 1 | 12 | 1 | 52 | 0.3 | 96 | 2 | ... | 1311 | 2373 | 10 | 1 | 10 | 1 | 1 | 1 | 2 | 1 |
| 29 | 851 | 0.0 | 0.50 | 0 | 3 | 0 | 21 | 0.4 | 200 | 5 | ... | 1263 | 478 | 12 | 7 | 10 | 1 | 0 | 1 | 0 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 70 | 1448 | 1.0 | 0.50 | 1 | 6 | 1 | 45 | 0.8 | 138 | 7 | ... | 1724 | 3378 | 13 | 11 | 2 | 1 | 1 | 1 | 3 | 1 |
| 71 | 1407 | 1.0 | 2.40 | 0 | 1 | 1 | 22 | 0.7 | 104 | 4 | ... | 1217 | 2192 | 9 | 7 | 13 | 1 | 0 | 1 | 2 | 2 |
| 72 | 605 | 0.0 | 1.00 | 0 | 8 | 1 | 9 | 0.1 | 142 | 3 | ... | 1026 | 392 | 15 | 6 | 4 | 1 | 0 | 0 | 0 | 1 |
| 73 | 1038 | 0.0 | 1.20 | 0 | 3 | 0 | 43 | 0.7 | 141 | 1 | ... | 638 | 3709 | 11 | 0 | 12 | 1 | 1 | 1 | 3 | 0 |
| 74 | 797 | 1.0 | 2.90 | 1 | 4 | 0 | 38 | 0.5 | 90 | 4 | ... | 1413 | 590 | 9 | 2 | 6 | 0 | 1 | 0 | 0 | 3 |
| 75 | 819 | 0.0 | 0.60 | 1 | 8 | 1 | 42 | 0.9 | 188 | 6 | ... | 1242 | 1814 | 5 | 4 | 13 | 1 | 0 | 1 | 1 | 2 |
| 76 | 1114 | 0.0 | 2.80 | 0 | 4 | 1 | 9 | 0.4 | 197 | 3 | ... | 1071 | 907 | 10 | 7 | 17 | 1 | 1 | 0 | 0 | 2 |
| 77 | 1234 | 1.0 | 1.60 | 1 | 1 | 0 | 33 | 0.6 | 172 | 1 | ... | 778 | 1449 | 11 | 7 | 6 | 0 | 0 | 1 | 0 | 0 |
| 78 | 1199 | 1.0 | 2.50 | 1 | 15 | 1 | 16 | 0.2 | 116 | 4 | ... | 1552 | 3448 | 17 | 13 | 10 | 1 | 0 | 1 | 3 | 1 |
| 79 | 1103 | 0.0 | 1.00 | 1 | 6 | 0 | 29 | 0.7 | 111 | 6 | ... | 1486 | 1837 | 8 | 7 | 16 | 0 | 1 | 1 | 1 | 3 |
| 80 | 1589 | 1.0 | 0.60 | 1 | 0 | 1 | 58 | 0.9 | 85 | 7 | ... | 1206 | 3464 | 19 | 10 | 6 | 1 | 1 | 1 | 3 | 1 |
| 81 | 999 | 0.0 | 2.90 | 1 | 11 | 1 | 64 | 0.2 | 199 | 4 | ... | 1616 | 2593 | 14 | 11 | 16 | 1 | 1 | 0 | 2 | 1 |
| 82 | 1510 | 1.0 | 0.90 | 1 | 2 | 0 | 45 | 0.9 | 180 | 5 | ... | 1752 | 3484 | 9 | 6 | 11 | 0 | 1 | 1 | 3 | 3 |
| 83 | 1008 | 0.0 | 0.80 | 0 | 11 | 0 | 61 | 1.0 | 114 | 3 | ... | 1122 | 2009 | 10 | 3 | 11 | 1 | 0 | 0 | 1 | 2 |
| 84 | 1127 | 1.0 | 2.90 | 1 | 5 | 1 | 57 | 0.8 | 163 | 1 | ... | 1049 | 2048 | 11 | 5 | 17 | 1 | 1 | 1 | 1 | 2 |
| 85 | 1412 | 1.0 | 2.40 | 0 | 5 | 0 | 25 | 0.8 | 96 | 8 | ... | 1894 | 837 | 15 | 9 | 4 | 0 | 0 | 1 | 1 | 0 |
| 86 | 1496 | 1.0 | 2.00 | 1 | 4 | 0 | 42 | 0.5 | 182 | 5 | ... | 741 | 854 | 16 | 0 | 7 | 0 | 1 | 1 | 0 | 0 |
| 87 | 1083 | 1.0 | 2.90 | 1 | 1 | 1 | 64 | 0.8 | 178 | 3 | ... | 1118 | 3210 | 13 | 3 | 4 | 1 | 0 | 0 | 3 | 2 |
| 88 | 668 | 0.0 | 0.50 | 1 | 0 | 0 | 3 | 0.1 | 155 | 5 | ... | 1203 | 2746 | 9 | 5 | 8 | 1 | 1 | 1 | 2 | 3 |
| 89 | 1309 | 0.0 | 1.10 | 1 | 0 | 0 | 33 | 0.5 | 100 | 4 | ... | 1402 | 2334 | 10 | 1 | 11 | 1 | 0 | 1 | 2 | 3 |
| 90 | 1724 | 0.0 | 2.00 | 1 | 2 | 1 | 57 | 0.5 | 177 | 3 | ... | 1924 | 2822 | 19 | 3 | 16 | 1 | 1 | 0 | 3 | 2 |
| 91 | 1977 | 1.0 | 2.00 | 1 | 7 | 1 | 54 | 1.0 | 171 | 7 | ... | 1242 | 1971 | 18 | 7 | 14 | 1 | 1 | 0 | 2 | 1 |
| 92 | 885 | 0.0 | 2.30 | 1 | 0 | 1 | 15 | 0.4 | 103 | 7 | ... | 802 | 1410 | 16 | 10 | 5 | 1 | 1 | 1 | 0 | 1 |
| 93 | 879 | 1.0 | 2.50 | 0 | 11 | 1 | 14 | 0.7 | 83 | 6 | ... | 1105 | 349 | 16 | 10 | 2 | 1 | 0 | 1 | 0 | 1 |
| 94 | 1322 | 0.0 | 1.70 | 1 | 6 | 0 | 7 | 0.8 | 140 | 3 | ... | 1990 | 1418 | 19 | 17 | 12 | 0 | 1 | 0 | 1 | 0 |
| 95 | 1137 | 1.0 | 1.00 | 0 | 18 | 0 | 7 | 1.0 | 196 | 3 | ... | 1179 | 3616 | 13 | 5 | 12 | 1 | 1 | 1 | 3 | 1 |
| 96 | 1355 | 0.0 | 2.30 | 0 | 10 | 1 | 23 | 0.2 | 132 | 5 | ... | 891 | 880 | 19 | 4 | 8 | 1 | 1 | 0 | 0 | 1 |
| 97 | 1665 | 1.0 | 0.50 | 0 | 3 | 1 | 60 | 0.2 | 194 | 6 | ... | 1684 | 1601 | 18 | 17 | 14 | 1 | 0 | 0 | 2 | 1 |
| 98 | 657 | 0.0 | 2.50 | 0 | 0 | 0 | 37 | 0.7 | 141 | 2 | ... | 961 | 1412 | 14 | 3 | 18 | 0 | 1 | 0 | 0 | 3 |
| 99 | 593 | 0.0 | 0.50 | 0 | 6 | 0 | 31 | 0.4 | 156 | 7 | ... | 1317 | 1692 | 6 | 2 | 7 | 1 | 0 | 0 | 1 | 2 |
100 rows × 22 columns
External Index : NMI and Adjusted Rand Score
from sklearn.metrics.cluster import normalized_mutual_info_score
# External index: agreement between clusters and the true price classes.
normalized_mutual_info_score(dfMobile['price_range'], dfMobile['K-Means_Cluster_ID'])
0.003092508451085772
from sklearn.metrics.cluster import adjusted_rand_score
# External index: chance-corrected agreement with the true price classes.
adjusted_rand_score(dfMobile['price_range'], dfMobile['K-Means_Cluster_ID'])
0.001231533143980699
Characteristic of Kmeans Cluster : Exploration of the Clusters found
# One boxplot per feature (plus the target), grouped by K-means cluster,
# to profile what characterizes each cluster.
plt.figure(figsize=(20, 20), dpi=200)
columns_w_pr = ['battery_power', 'bluetooth', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi', 'price_range']
for subplot_idx, feature in enumerate(columns_w_pr, start=1):
    plt.subplot(7, 4, subplot_idx)
    sns.boxplot(x='K-Means_Cluster_ID', y=feature, data=dfMobile)
Thanks to these plots we can see the distribution of each feature within each cluster and understand how the K-means clustering partitioned our data.
We used two hierarchical clustering implementations, one from SciPy and one from scikit-learn, to compare them.
# Wrap the scaled numpy array back into a DataFrame. Reuse the `columns`
# list defined earlier instead of retyping the 20 names, so the two
# copies cannot drift apart.
df_data_scaled = pd.DataFrame(data_scaled, columns=columns)
df_data_scaled.head()
| battery_power | bluetooth | clock_speed | dual_sim | fc | four_g | int_memory | m_dep | mobile_wt | n_cores | pc | px_height | px_width | ram | sc_h | sc_w | talk_time | three_g | touch_screen | wifi | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.902597 | -0.987083 | 0.830707 | -1.019184 | -0.762495 | -1.043966 | -1.380644 | 0.340740 | 1.349249 | -1.101971 | -1.305750 | -1.408949 | -1.146784 | 0.391703 | -0.784983 | 0.283103 | 1.462493 | -1.786861 | -1.006018 | 0.986097 |
| 1 | -0.495139 | 1.013086 | -1.256497 | 0.981177 | -0.992890 | 0.957886 | 1.155024 | 0.687548 | -0.120059 | -0.664768 | -0.645989 | 0.585778 | 1.704465 | 0.467317 | 1.114266 | -0.635317 | -0.734267 | 0.559641 | 0.994018 | -1.014099 |
| 2 | -1.537686 | 1.013086 | -1.256497 | 0.981177 | -0.532099 | 0.957886 | 0.493546 | 1.381165 | 0.134244 | 0.209639 | -0.645989 | 1.392684 | 1.074968 | 0.441498 | -0.310171 | -0.864922 | -0.368140 | 0.559641 | 0.994018 | -1.014099 |
| 3 | -1.419319 | 1.013086 | 1.199038 | -1.019184 | -0.992890 | -1.043966 | -1.215274 | 1.034357 | -0.261339 | 0.646842 | -0.151168 | 1.286750 | 1.236971 | 0.594569 | 0.876859 | 0.512708 | -0.002014 | 0.559641 | -1.006018 | -1.014099 |
| 4 | 1.325906 | 1.013086 | -0.397060 | -1.019184 | 2.002254 | 0.957886 | 0.658915 | 0.340740 | 0.021220 | -1.101971 | 0.673534 | 1.268718 | -0.091452 | -0.657666 | -1.022389 | -0.864922 | 0.730240 | 0.559641 | 0.994018 | -1.014099 |
import matplotlib.pyplot as plt
import seaborn as sns
# Correlation heatmap of the scaled features (standardization does not
# change pairwise correlations; this mirrors the earlier heatmap minus
# the target column).
plt.figure(figsize=(10,8))
ax = sns.heatmap(df_data_scaled.corr(), cmap='Greens', annot=True)
plt.title("Correlation Matrix", fontsize=20)
plt.show()
single linkage
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree
# Hierarchical clustering (single linkage, euclidean) on the scaled
# features, visualized as a dendrogram.
plt.figure(figsize=(10,15))
sl_mergings = linkage(df_data_scaled, method="single", metric='euclidean')
dendrogram(sl_mergings)
plt.show()
complete linkage
# Hierarchical clustering (complete linkage, euclidean) on the scaled
# features — usually gives more balanced clusters than single linkage.
plt.figure(figsize=(30,30))
cl_mergings = linkage(df_data_scaled, method="complete", metric='euclidean')
dendrogram(cl_mergings)
plt.show()
# Cut the complete-linkage dendrogram into 4 flat clusters; reshape the
# (n, 1) output of cut_tree to a flat label vector.
cl_cluster_labels = cut_tree(cl_mergings, n_clusters=4).reshape(-1, )
cl_cluster_labels
array([0, 0, 0, ..., 0, 3, 2])
# Attach the SciPy hierarchical cluster assignment to the main DataFrame.
dfMobile['Hierarchical_Cluster_labels']=cl_cluster_labels
Characteristic of Hierachial Cluster : Exploration of the Clusters found
# Profile the hierarchical clusters: one boxplot per feature (plus the
# target), grouped by cluster label.
plt.figure(figsize=(20, 20), dpi=200)
columns_w_pr = ['battery_power', 'bluetooth', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi', 'price_range']
for subplot_idx, feature in enumerate(columns_w_pr, start=1):
    plt.subplot(7, 4, subplot_idx)
    sns.boxplot(x='Hierarchical_Cluster_labels', y=feature, data=dfMobile)
from sklearn.cluster import AgglomerativeClustering
# Ward-linkage agglomerative clustering — note this runs on X, the 2-D
# PCA projection, not on the full scaled feature set.
cluster_h = AgglomerativeClustering(n_clusters=4, affinity='euclidean', linkage='ward')
cluster_h.fit_predict(X)
cluster_h.labels_
array([0, 1, 3, ..., 3, 2, 2], dtype=int64)
Internal Index : Silhouette Score
from sklearn.metrics import silhouette_score
# Internal index: silhouette score of agglomerative clustering for each
# candidate cluster count.
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
for num_clusters in range_n_clusters:
    # initialise agglomerative clustering with this cluster count
    cluster_hh = AgglomerativeClustering(n_clusters=num_clusters, affinity='euclidean', linkage='ward')
    cluster_hh.fit(X)
    # BUG FIX: the original read cluster_h.labels_ — the earlier, fixed
    # 4-cluster model — so every k reported the identical score seen in
    # the original output. Score the model fitted in this iteration.
    cluster_hh_labels = cluster_hh.labels_
    # silhouette score
    silhouette_avg = silhouette_score(X, cluster_hh_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))
For n_clusters=2, the silhouette score is 0.2655857739743639 For n_clusters=3, the silhouette score is 0.2655857739743639 For n_clusters=4, the silhouette score is 0.2655857739743639 For n_clusters=5, the silhouette score is 0.2655857739743639 For n_clusters=6, the silhouette score is 0.2655857739743639 For n_clusters=7, the silhouette score is 0.2655857739743639 For n_clusters=8, the silhouette score is 0.2655857739743639
Comparison of K-means and Hierarchical Clustering
# DataFrame view of the 4 principal components, for easy plotting.
data_pca_df = pd.DataFrame(data_pca,columns=['PC1','PC2','PC3','PC4'])
data_pca_df.head()
| PC1 | PC2 | PC3 | PC4 | |
|---|---|---|---|---|
| 0 | 0.293058 | -2.614122 | -0.583106 | 1.295868 |
| 1 | -1.976934 | 0.575289 | 0.371078 | -1.130758 |
| 2 | -1.269173 | 0.194788 | 0.074594 | -1.986688 |
| 3 | -1.087370 | -0.242464 | 1.628784 | -0.228918 |
| 4 | 1.647321 | 1.289198 | -0.363200 | -1.840295 |
# Attach all three cluster assignments to the PCA DataFrame.
data_pca_df['K_Means_Cluster_ID'] = kmeans.labels_
data_pca_df['Hierarchical_Cluster_Labels'] = cl_cluster_labels
data_pca_df['Hierarchical_Cluster_Labels_sklearn'] = cluster_h.labels_
# Side-by-side PC1/PC2 scatters colored by K-means vs sklearn
# hierarchical labels (subplot slot 2 is intentionally left empty).
plt.figure(figsize=(12,6),dpi=200)
plt.subplot(1,3,1)
sns.scatterplot(x='PC1',y='PC2',data=data_pca_df,hue='K_Means_Cluster_ID')
plt.subplot(1,3,3)
sns.scatterplot(x='PC1',y='PC2',data=data_pca_df,hue='Hierarchical_Cluster_Labels_sklearn')
<matplotlib.axes._subplots.AxesSubplot at 0x16a643e0438>
# Attach the sklearn hierarchical labels to the main DataFrame, then
# compare the three colorings (K-means, hierarchical, ground truth) in
# the ram/battery_power plane — the two most predictive features.
dfMobile['Hierarchical_Cluster_Labels_sklearn']=cluster_h.labels_
plt.figure(figsize=(15,10),dpi=200)
plt.subplot(1,3,1)
sns.scatterplot(x='ram',y='battery_power',data=dfMobile,hue='K-Means_Cluster_ID')
plt.subplot(1,3,2)
sns.scatterplot(x='ram',y='battery_power',data=dfMobile,hue='Hierarchical_Cluster_Labels_sklearn')
plt.subplot(1,3,3)
sns.scatterplot(x='ram',y='battery_power',data=dfMobile,hue='price_range')
<matplotlib.axes._subplots.AxesSubplot at 0x16a6dbceac8>
The results are not good at all.
That seems to make sense, especially because these are all unsupervised algorithms and not all of our features determine the price of a mobile.
In fact, the mobile price doesn't reflect the overall quality of a phone.
Our features are non-linear, so PCA can't perform well on this dataset.
Let's try another dimensionality reduction method.
Let's try t-SNE, which can handle non-linear features.
from sklearn.manifold import TSNE
# Embed the raw (unscaled) features into 2-D with t-SNE.
# NOTE(review): learning_rate=1.2 is far below the 10–1000 range that
# sklearn's docs recommend for t-SNE — confirm this is intentional.
x_embedded = TSNE(n_components=2, init='random',perplexity=60.0,n_iter=1000,learning_rate=1.2).fit_transform(x)
from scipy.spatial.distance import cdist
# Elbow diagnostics on the t-SNE embedding: distortion (mean distance to
# the nearest centroid) and inertia, for k = 1..9.
distortions = []
inertias = []
K = range(1, 10)
for k in K:
    # Build and fit the model once; the original called .fit() a second
    # time on the already-fitted model, doubling the work for no effect.
    kmeanModel = KMeans(n_clusters=k).fit(x_embedded)
    distortions.append(sum(np.min(cdist(x_embedded, kmeanModel.cluster_centers_,
                                        'euclidean'), axis=1)) / x_embedded.shape[0])
    inertias.append(kmeanModel.inertia_)
plt.figure(figsize=(10,6),dpi=200)
# Pass x/y as keywords: positional use raises a FutureWarning and is an
# error in seaborn 0.12+.
sns.scatterplot(x=x_embedded[:,0], y=x_embedded[:,1], hue=label, legend='full')
<matplotlib.axes._subplots.AxesSubplot at 0x1cb56129c18>
Elbow Method
# Inertia vs k — look for the "elbow" where the curve flattens.
plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('The Elbow Method using Inertia')
plt.show()
# K-means with 4 clusters on the t-SNE embedding, plotted per cluster.
colors = ['b', 'g', 'c', 'k']
markers = ['o', 'v', 'h', '+']
plt.ylabel('T-SNE dimension')
Xkmeans = KMeans(n_clusters=4).fit(x_embedded)
plt.scatter(Xkmeans.cluster_centers_[:, 0], Xkmeans.cluster_centers_[:, 1],
            s=200, c='red', label='Centroids')
# One vectorized scatter call per cluster instead of one plt.plot call
# per point (4 calls instead of 2000).
for cluster_id in range(4):
    mask = Xkmeans.labels_ == cluster_id
    plt.scatter(x_embedded[mask, 0], x_embedded[mask, 1],
                color=colors[cluster_id], marker=markers[cluster_id])
plt.xlabel(' T-SNE dimension ')
plt.legend()
plt.show()
External Index :Adjusted Rand Score
from sklearn.metrics.cluster import adjusted_rand_score
# External index: K-means on t-SNE vs the true price classes.
adjusted_rand_score(dfMobile['price_range'], Xkmeans.labels_)
0.43269019321563834
Using t-SNE with K-means seems to be a better idea than PCA with K-means.
Compare distance for Kmeans
from pyclustering.cluster.kmeans import kmeans
from pyclustering.utils.metric import distance_metric
from pyclustering.cluster.center_initializer import random_center_initializer
from pyclustering.cluster.encoder import type_encoding
from pyclustering.cluster.encoder import cluster_encoder
from sklearn.metrics.cluster import contingency_matrix
# Map from a human-readable distance name to pyclustering's type_metric
# enum value. NOTE(review): value 4 is skipped — presumably Minkowski,
# which requires an extra degree parameter; confirm against the
# pyclustering documentation.
distance_measures = {'euclidean': 0, 'squared euclidean': 1, 'manhattan': 2, 'chebyshev': 3,
                     'canberra': 5, 'chi-square': 6,}
def purity_score(y_true, y_pred):
    """Return the cluster purity: assign each cluster its majority true
    label and compute the fraction of points assigned correctly."""
    #contingency matrix
    confusion_matrix = contingency_matrix(y_true, y_pred)
    return np.sum(np.amax(confusion_matrix, axis=0)) / np.sum(confusion_matrix)
def dist_compare(dist_measure):
    """Run pyclustering K-means (k=4) on the t-SNE embedding using the
    given distance metric, and return the flat per-point cluster labels."""
    initial_centers = random_center_initializer(x_embedded, 4, random_state=5).initialize()
    # instance configured with the requested distance metric
    instanceKm = kmeans(x_embedded, initial_centers=initial_centers, metric=distance_metric(dist_measure))
    # run the clustering
    instanceKm.process()
    # clusters and centers
    pyClusters = instanceKm.get_clusters()
    pyCenters = instanceKm.get_centers()
    # re-encode the cluster lists as one label per point (encoding 0)
    pyEncoding = instanceKm.get_cluster_encoding()
    pyEncoder = cluster_encoder(pyEncoding, pyClusters, x_embedded)
    pyLabels = pyEncoder.set_encoding(0).get_clusters()
    #print(pyLabels)
    return pyLabels#purity_score(y, pyLabels)
from sklearn.metrics.cluster import adjusted_rand_score
# Compare the adjusted Rand index against the true price classes for each
# pyclustering distance metric. (Import hoisted out of the loop body —
# the original re-imported it on every iteration.)
for measure, value in distance_measures.items():
    print(measure)
    print(adjusted_rand_score(dfMobile['price_range'], dist_compare(value)))
euclidean 0.43464680327301536 squared euclidean 0.43291598836715267 manhattan 0.4285617975628693 chebyshev 0.4315372049701328 canberra 0.3066896735611582 chi-square 0.31749946445265775
Changing distance method for Kmeans
from scipy.spatial import distance
# Pairwise squared-euclidean distance matrix (n x n) of the t-SNE points.
x_seu=distance.cdist(x_embedded,x_embedded, 'sqeuclidean')
print(x_seu)
[[ 0. 191.23233639 240.57781097 ... 103.63924007 287.77230207
75.67688427]
[191.23233639 0. 2.99813657 ... 14.62967268 451.14992901
180.90951005]
[240.57781097 2.99813657 0. ... 29.15724822 511.88008515
212.75188353]
...
[103.63924007 14.62967268 29.15724822 ... 0. 404.33942657
100.30713749]
[287.77230207 451.14992901 511.88008515 ... 404.33942657 0.
632.34651025]
[ 75.67688427 180.90951005 212.75188353 ... 100.30713749 632.34651025
0. ]]
Position of Centroids
# K-means on the 2000x2000 distance matrix: each point is represented by
# its vector of squared distances to every other point.
# NOTE(review): sklearn's KMeans has no precomputed-distance mode, so it
# treats these rows as plain feature vectors — confirm this is the intent.
Xkmeans_seu = KMeans(n_clusters=4).fit(x_seu)
plt.scatter(Xkmeans_seu.cluster_centers_[:,0],Xkmeans_seu.cluster_centers_[:,1],s=200,c='red',label='Centroids')
<matplotlib.collections.PathCollection at 0x1cb61461710>
from sklearn.metrics.cluster import adjusted_rand_score
# External index for the distance-matrix K-means variant.
adjusted_rand_score(dfMobile['price_range'], Xkmeans_seu.labels_)
0.4743270263812373
Changing the distance method before applying Kmeans improves our adjusted rand score
1.Gower distance with Kmedoids
import gower
from sklearn_extra.cluster import KMedoids
# Gower similarity matrix.
# NOTE(review): this is computed on x_seu — the squared-euclidean distance
# matrix — rather than on the original mixed-type features, which is
# where Gower distance is normally applied; confirm this is intentional.
gower_mat = gower.gower_matrix(x_seu)
# K-medoids (PAM) on the precomputed Gower matrix, 4 clusters.
km_model = KMedoids(n_clusters = 4, random_state = 0, metric = 'precomputed', method = 'pam', init = 'k-medoids++').fit(gower_mat)
clusters_kmedoids = km_model.labels_
from sklearn.metrics.cluster import adjusted_rand_score
adjusted_rand_score(dfMobile['price_range'], clusters_kmedoids)
0.42935679308294133
2.K-Prototype Algorithm
from kmodes.kprototypes import KPrototypes

# Feature table for K-Prototypes (everything except the price_range target).
# Fix: take an explicit copy — the original assigned into a view of dfMobile,
# which produced the long stream of SettingWithCopyWarning messages.
dfMobileKPrototypes = dfMobile[['battery_power', 'bluetooth', 'clock_speed',
                                'dual_sim', 'fc', 'four_g', 'int_memory',
                                'm_dep', 'mobile_wt', 'n_cores', 'pc',
                                'px_height', 'px_width', 'ram', 'sc_h', 'sc_w',
                                'talk_time', 'three_g', 'touch_screen',
                                'wifi']].copy()

# Binary indicator columns -> bool. `!= 0` reproduces the original
# astype(str)/str.replace/astype(bool) mapping exactly:
# 0 -> False, 1 -> True, and NaN -> True (NaN compares unequal to 0, just as
# the old code turned the string 'nan' into True).
# NOTE(review): NaN becoming True is inherited behavior — consider
# fillna(0) first if a missing value should mean the feature is absent.
for _col in ['bluetooth', 'dual_sim', 'four_g', 'three_g',
             'touch_screen', 'wifi']:
    dfMobileKPrototypes[_col] = dfMobileKPrototypes[_col] != 0
C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy """Entry point for launching an IPython kernel. C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy This is separate from the ipykernel package so we can avoid doing imports until C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy """ C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy import sys C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy if __name__ == '__main__': C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy # Remove the CWD from sys.path while we load stuff. C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy # This is added back by InteractiveShellApp.init_path() C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy del sys.path[0] C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:14: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy from ipykernel import kernelapp as app C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:17: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:18: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:19: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:21: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:22: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:23: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Cast the six boolean indicator columns to pandas Categorical so that
# K-Prototypes treats them as categorical rather than numeric features.
for cat_col in ('bluetooth', 'dual_sim', 'four_g',
                'three_g', 'touch_screen', 'wifi'):
    dfMobileKPrototypes[cat_col] = pd.Categorical(dfMobileKPrototypes[cat_col])
C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy """Entry point for launching an IPython kernel. C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy This is separate from the ipykernel package so we can avoid doing imports until C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy after removing the cwd from sys.path. C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy """ C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
# Confirm the six indicator columns now carry the `category` dtype.
dfMobileKPrototypes.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2000 entries, 0 to 1999 Data columns (total 20 columns): battery_power 2000 non-null int64 bluetooth 2000 non-null category clock_speed 2000 non-null float64 dual_sim 2000 non-null category fc 2000 non-null int64 four_g 2000 non-null category int_memory 2000 non-null int64 m_dep 2000 non-null float64 mobile_wt 2000 non-null int64 n_cores 2000 non-null int64 pc 2000 non-null int64 px_height 2000 non-null int64 px_width 2000 non-null int64 ram 2000 non-null int64 sc_h 2000 non-null int64 sc_w 2000 non-null int64 talk_time 2000 non-null int64 three_g 2000 non-null category touch_screen 2000 non-null category wifi 2000 non-null category dtypes: category(6), float64(2), int64(12) memory usage: 326.7 KB
# Column positions of the categorical features, derived from the dtypes
# instead of hard-coded magic indices. For the column order above this
# evaluates to [1, 3, 5, 17, 18, 19], but it stays correct if columns are
# ever added or reordered.
categorical_features_idx = [
    dfMobileKPrototypes.columns.get_loc(c)
    for c in dfMobileKPrototypes.select_dtypes(include='category').columns
]
k_pro_array = dfMobileKPrototypes.values
# K-Prototypes mixes a K-means-style cost on the numeric columns with a
# matching-dissimilarity cost on the categorical ones; 4 clusters mirrors
# the 4 price_range classes.
kproto = KPrototypes(n_clusters=4, verbose=2, max_iter=20).fit(
    k_pro_array, categorical=categorical_features_idx)
Init: initializing centroids Init: initializing clusters Starting iterations... Run: 1, iteration: 1/20, moves: 365, ncost: 1335788196.7127144 Run: 1, iteration: 2/20, moves: 247, ncost: 1286766121.4449177 Run: 1, iteration: 3/20, moves: 132, ncost: 1267481368.001697 Run: 1, iteration: 4/20, moves: 74, ncost: 1261735189.019135 Run: 1, iteration: 5/20, moves: 37, ncost: 1259904287.0425026 Run: 1, iteration: 6/20, moves: 27, ncost: 1258768888.3110297 Run: 1, iteration: 7/20, moves: 17, ncost: 1258486836.0660462 Run: 1, iteration: 8/20, moves: 6, ncost: 1258409500.4406757 Run: 1, iteration: 9/20, moves: 1, ncost: 1258404843.3486428 Run: 1, iteration: 10/20, moves: 0, ncost: 1258404843.3486428 Init: initializing centroids Init: initializing clusters Starting iterations... Run: 2, iteration: 1/20, moves: 495, ncost: 1347022660.676937 Run: 2, iteration: 2/20, moves: 231, ncost: 1321745323.2697577 Run: 2, iteration: 3/20, moves: 147, ncost: 1295415351.381645 Run: 2, iteration: 4/20, moves: 170, ncost: 1269322520.004055 Run: 2, iteration: 5/20, moves: 110, ncost: 1261243941.646334 Run: 2, iteration: 6/20, moves: 49, ncost: 1259165260.7156408 Run: 2, iteration: 7/20, moves: 26, ncost: 1258649503.8514829 Run: 2, iteration: 8/20, moves: 15, ncost: 1258503294.5644262 Run: 2, iteration: 9/20, moves: 9, ncost: 1258403421.0886736 Run: 2, iteration: 10/20, moves: 2, ncost: 1258393897.3380303 Run: 2, iteration: 11/20, moves: 4, ncost: 1258370845.4224887 Run: 2, iteration: 12/20, moves: 0, ncost: 1258370845.4224887 Init: initializing centroids Init: initializing clusters Starting iterations... 
Run: 3, iteration: 1/20, moves: 465, ncost: 1365210718.8953986 Run: 3, iteration: 2/20, moves: 202, ncost: 1337759114.0471208 Run: 3, iteration: 3/20, moves: 101, ncost: 1327819230.735592 Run: 3, iteration: 4/20, moves: 67, ncost: 1322365486.1073427 Run: 3, iteration: 5/20, moves: 94, ncost: 1312785705.8145337 Run: 3, iteration: 6/20, moves: 105, ncost: 1298153750.1779969 Run: 3, iteration: 7/20, moves: 142, ncost: 1270557034.1945703 Run: 3, iteration: 8/20, moves: 107, ncost: 1257638952.5052345 Run: 3, iteration: 9/20, moves: 83, ncost: 1250737781.0568662 Run: 3, iteration: 10/20, moves: 47, ncost: 1248765416.249072 Run: 3, iteration: 11/20, moves: 27, ncost: 1248186171.5503166 Run: 3, iteration: 12/20, moves: 12, ncost: 1248017750.8871937 Run: 3, iteration: 13/20, moves: 2, ncost: 1248009623.6393702 Run: 3, iteration: 14/20, moves: 2, ncost: 1247999784.9239435 Run: 3, iteration: 15/20, moves: 0, ncost: 1247999784.9239435 Init: initializing centroids Init: initializing clusters Starting iterations... Run: 4, iteration: 1/20, moves: 354, ncost: 1331043990.7173636 Run: 4, iteration: 2/20, moves: 233, ncost: 1283662240.314904 Run: 4, iteration: 3/20, moves: 123, ncost: 1268515242.6091912 Run: 4, iteration: 4/20, moves: 90, ncost: 1262362737.70824 Run: 4, iteration: 5/20, moves: 34, ncost: 1261459484.4358892 Run: 4, iteration: 6/20, moves: 17, ncost: 1260993250.2814562 Run: 4, iteration: 7/20, moves: 12, ncost: 1260787735.101425 Run: 4, iteration: 8/20, moves: 7, ncost: 1260717911.4670916 Run: 4, iteration: 9/20, moves: 6, ncost: 1260612713.2721345 Run: 4, iteration: 10/20, moves: 7, ncost: 1260526881.91059 Run: 4, iteration: 11/20, moves: 3, ncost: 1260497811.8599546 Run: 4, iteration: 12/20, moves: 1, ncost: 1260494298.63101 Run: 4, iteration: 13/20, moves: 0, ncost: 1260494298.63101 Init: initializing centroids Init: initializing clusters Starting iterations... 
Run: 5, iteration: 1/20, moves: 427, ncost: 1351371961.8425052 Run: 5, iteration: 2/20, moves: 195, ncost: 1318403926.9686832 Run: 5, iteration: 3/20, moves: 106, ncost: 1307524358.4127038 Run: 5, iteration: 4/20, moves: 52, ncost: 1303469344.2493446 Run: 5, iteration: 5/20, moves: 44, ncost: 1300753909.940451 Run: 5, iteration: 6/20, moves: 29, ncost: 1299738107.5775242 Run: 5, iteration: 7/20, moves: 36, ncost: 1297862043.551715 Run: 5, iteration: 8/20, moves: 68, ncost: 1289691586.4498892 Run: 5, iteration: 9/20, moves: 92, ncost: 1274452695.448373 Run: 5, iteration: 10/20, moves: 97, ncost: 1259517693.635027 Run: 5, iteration: 11/20, moves: 81, ncost: 1251392901.7264695 Run: 5, iteration: 12/20, moves: 44, ncost: 1249100649.9282687 Run: 5, iteration: 13/20, moves: 16, ncost: 1248653540.0778313 Run: 5, iteration: 14/20, moves: 24, ncost: 1247805983.4954555 Run: 5, iteration: 15/20, moves: 14, ncost: 1247519507.505109 Run: 5, iteration: 16/20, moves: 8, ncost: 1247407148.0406885 Run: 5, iteration: 17/20, moves: 5, ncost: 1247384824.2974381 Run: 5, iteration: 18/20, moves: 0, ncost: 1247384824.2974381 Init: initializing centroids Init: initializing clusters Starting iterations... 
Run: 6, iteration: 1/20, moves: 539, ncost: 1336015847.1774583 Run: 6, iteration: 2/20, moves: 209, ncost: 1315672647.849708 Run: 6, iteration: 3/20, moves: 89, ncost: 1310923194.070927 Run: 6, iteration: 4/20, moves: 28, ncost: 1310281337.8034508 Run: 6, iteration: 5/20, moves: 23, ncost: 1309846486.9018633 Run: 6, iteration: 6/20, moves: 16, ncost: 1309491826.7627916 Run: 6, iteration: 7/20, moves: 33, ncost: 1308047589.1240118 Run: 6, iteration: 8/20, moves: 38, ncost: 1306627881.3469722 Run: 6, iteration: 9/20, moves: 53, ncost: 1303574579.3726957 Run: 6, iteration: 10/20, moves: 64, ncost: 1298477779.143969 Run: 6, iteration: 11/20, moves: 54, ncost: 1294483824.8532133 Run: 6, iteration: 12/20, moves: 64, ncost: 1288812389.078308 Run: 6, iteration: 13/20, moves: 65, ncost: 1282305131.9629664 Run: 6, iteration: 14/20, moves: 94, ncost: 1269472487.2032847 Run: 6, iteration: 15/20, moves: 123, ncost: 1254449833.4702075 Run: 6, iteration: 16/20, moves: 85, ncost: 1249132236.6480777 Run: 6, iteration: 17/20, moves: 32, ncost: 1248257690.7254918 Run: 6, iteration: 18/20, moves: 14, ncost: 1248046291.8838475 Run: 6, iteration: 19/20, moves: 7, ncost: 1247999784.9239435 Run: 6, iteration: 20/20, moves: 0, ncost: 1247999784.9239435 Init: initializing centroids Init: initializing clusters Starting iterations... 
Run: 7, iteration: 1/20, moves: 452, ncost: 1290124169.8626802 Run: 7, iteration: 2/20, moves: 200, ncost: 1266045360.2812688 Run: 7, iteration: 3/20, moves: 65, ncost: 1262833258.7834222 Run: 7, iteration: 4/20, moves: 38, ncost: 1261738082.504289 Run: 7, iteration: 5/20, moves: 25, ncost: 1261293095.0132117 Run: 7, iteration: 6/20, moves: 16, ncost: 1261048861.4519992 Run: 7, iteration: 7/20, moves: 6, ncost: 1260953545.674998 Run: 7, iteration: 8/20, moves: 6, ncost: 1260861189.7837906 Run: 7, iteration: 9/20, moves: 4, ncost: 1260817579.1312745 Run: 7, iteration: 10/20, moves: 7, ncost: 1260705495.4245546 Run: 7, iteration: 11/20, moves: 10, ncost: 1260520275.2959063 Run: 7, iteration: 12/20, moves: 5, ncost: 1260490753.7292767 Run: 7, iteration: 13/20, moves: 0, ncost: 1260490753.7292767 Init: initializing centroids Init: initializing clusters Starting iterations... Run: 8, iteration: 1/20, moves: 476, ncost: 1294229202.8642166 Run: 8, iteration: 2/20, moves: 207, ncost: 1263450317.7044842 Run: 8, iteration: 3/20, moves: 58, ncost: 1260571987.7885466 Run: 8, iteration: 4/20, moves: 37, ncost: 1258698946.9572828 Run: 8, iteration: 5/20, moves: 18, ncost: 1258423386.1546617 Run: 8, iteration: 6/20, moves: 4, ncost: 1258404843.3486428 Run: 8, iteration: 7/20, moves: 0, ncost: 1258404843.3486428 Init: initializing centroids Init: initializing clusters Starting iterations... 
Run: 9, iteration: 1/20, moves: 366, ncost: 1279488546.7664633 Run: 9, iteration: 2/20, moves: 127, ncost: 1265214135.2260005 Run: 9, iteration: 3/20, moves: 55, ncost: 1262841584.4683874 Run: 9, iteration: 4/20, moves: 25, ncost: 1262200655.0264049 Run: 9, iteration: 5/20, moves: 14, ncost: 1262093546.084259 Run: 9, iteration: 6/20, moves: 9, ncost: 1262027573.350804 Run: 9, iteration: 7/20, moves: 4, ncost: 1261999319.125979 Run: 9, iteration: 8/20, moves: 3, ncost: 1261964852.421694 Run: 9, iteration: 9/20, moves: 2, ncost: 1261957564.1233125 Run: 9, iteration: 10/20, moves: 0, ncost: 1261957564.1233125 Init: initializing centroids Init: initializing clusters Starting iterations... Run: 10, iteration: 1/20, moves: 557, ncost: 1303173461.2583773 Run: 10, iteration: 2/20, moves: 220, ncost: 1270647801.5480719 Run: 10, iteration: 3/20, moves: 115, ncost: 1260457065.8079798 Run: 10, iteration: 4/20, moves: 63, ncost: 1257866954.3915002 Run: 10, iteration: 5/20, moves: 35, ncost: 1256813111.2437422 Run: 10, iteration: 6/20, moves: 34, ncost: 1255308959.0691562 Run: 10, iteration: 7/20, moves: 29, ncost: 1253401689.5284185 Run: 10, iteration: 8/20, moves: 33, ncost: 1250753209.2096863 Run: 10, iteration: 9/20, moves: 30, ncost: 1248951508.867046 Run: 10, iteration: 10/20, moves: 24, ncost: 1248125092.2691853 Run: 10, iteration: 11/20, moves: 9, ncost: 1248019187.7005372 Run: 10, iteration: 12/20, moves: 3, ncost: 1247999784.9239435 Run: 10, iteration: 13/20, moves: 0, ncost: 1247999784.9239435 Best run was number 5
# Assign every phone to its learned prototype cluster and record the label
# as a new column on the frame.
clusters = kproto.predict(k_pro_array, categorical=categorical_features_idx)
dfMobileKPrototypes['cluster'] = list(clusters)
C:\Users\mayou\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy """Entry point for launching an IPython kernel.
# Peek at the first rows with the new cluster assignment appended.
dfMobileKPrototypes.head(5)
| battery_power | bluetooth | clock_speed | dual_sim | fc | four_g | int_memory | m_dep | mobile_wt | n_cores | ... | px_height | px_width | ram | sc_h | sc_w | talk_time | three_g | touch_screen | wifi | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Index | |||||||||||||||||||||
| 0 | 842 | False | 2.2 | False | 1 | False | 7 | 0.6 | 188 | 2 | ... | 20 | 756 | 2549 | 9 | 7 | 19 | False | False | True | 3 |
| 1 | 1021 | True | 0.5 | True | 0 | True | 53 | 0.7 | 136 | 3 | ... | 905 | 1988 | 2631 | 17 | 3 | 7 | True | True | False | 2 |
| 2 | 563 | True | 0.5 | True | 2 | True | 41 | 0.9 | 145 | 5 | ... | 1263 | 1716 | 2603 | 11 | 2 | 9 | True | True | False | 2 |
| 3 | 615 | True | 2.5 | False | 0 | False | 10 | 0.8 | 131 | 6 | ... | 1216 | 1786 | 2769 | 16 | 8 | 11 | True | False | False | 2 |
| 4 | 1821 | True | 1.2 | False | 13 | True | 44 | 0.6 | 141 | 2 | ... | 1208 | 1212 | 1411 | 8 | 2 | 15 | True | True | False | 2 |
5 rows × 21 columns
from sklearn.metrics.cluster import adjusted_rand_score

# Agreement between the K-Prototypes clusters and the true price classes.
kproto_ari = adjusted_rand_score(dfMobile['price_range'], clusters)
kproto_ari
0.4666108067332902
3. Spectral clustering on the t-SNE embedding
from sklearn.cluster import SpectralClustering

# Spectral clustering on the 2-D t-SNE embedding with an RBF affinity.
# Fix: pin random_state so the spectral embedding / final k-means step is
# deterministic — the original call was non-reproducible, so the reported
# ARI could vary between runs.
sc = SpectralClustering(affinity='rbf', n_clusters=4, random_state=0).fit(x_embedded)
labels = sc.labels_

# Colour each embedded point by its spectral cluster.
plt.scatter(x_embedded[:, 0], x_embedded[:, 1], c=labels)
plt.show()
from sklearn.metrics.cluster import adjusted_rand_score

# Agreement between the spectral clusters and the true price classes.
adjusted_rand_score(dfMobile['price_range'], sc.labels_)
0.2897263721763855